Introduction

This IPython notebook illustrates how to remove features from a feature table. First, we need to import the py_entitymatching package and other libraries as follows:


In [1]:
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes:


In [2]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids doubling it when the
# install path already ends with one.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')

In [3]:
# Load tables A and B (in that order) from their CSV files, using 'ID' as the
# key attribute of each table
A, B = [em.read_csv_metadata(csv_path, key='ID') for csv_path in (path_A, path_B)]

In [4]:
# Generate the feature table used for blocking from tables A and B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the
# confirmation of inferred attribute types -- confirm in the library docs.
feature_table = em.get_features_for_blocking(
    A,
    B,
    validate_inferred_attr_types=False,
)
# Alternative: to generate features for matching instead, use
# feature_table = em.get_features_for_matching(A, B)

Removing Features from Feature Table


In [5]:
# Inspect the type of the feature table; the recorded output shows it is a
# pandas DataFrame, so features can be removed with ordinary pandas operations.
type(feature_table)


Out[5]:
pandas.core.frame.DataFrame

In [6]:
# Preview the first few rows of the feature table (one row per feature)
feature_table.head()


Out[6]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
0 ID_ID_lev_dist ID ID None None lev_dist <function ID_ID_lev_dist at 0x10b5987b8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
1 ID_ID_lev_sim ID ID None None lev_sim <function ID_ID_lev_sim at 0x10f9b0620> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
2 ID_ID_jar ID ID None None jaro <function ID_ID_jar at 0x10f9b0950> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
3 ID_ID_jwn ID ID None None jaro_winkler <function ID_ID_jwn at 0x10f9b09d8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
4 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x10f9b08c8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [7]:
# Drop the feature whose index label is 0 (the first row of the freshly
# generated feature table). DataFrame.drop works on labels, not positions.
# errors='ignore' keeps this cell safe to re-run: once label 0 is gone, a
# second run is a no-op instead of raising KeyError.
feature_table = feature_table.drop(index=0, errors='ignore')

In [8]:
# Preview the table again to check the result of the drop
feature_table.head()


Out[8]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
1 ID_ID_lev_sim ID ID None None lev_sim <function ID_ID_lev_sim at 0x10f9b0620> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
2 ID_ID_jar ID ID None None jaro <function ID_ID_jar at 0x10f9b0950> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
3 ID_ID_jwn ID ID None None jaro_winkler <function ID_ID_jwn at 0x10f9b09d8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
4 ID_ID_exm ID ID None None exact_match <function ID_ID_exm at 0x10f9b08c8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
5 ID_ID_jac_qgm_3_qgm_3 ID ID qgm_3 qgm_3 jaccard <function ID_ID_jac_qgm_3_qgm_3 at 0x10f9b0a60> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [9]:
# Keep only the features whose left attribute is 'name'; every other feature
# is removed from the table.
is_name_feature = feature_table['left_attribute'] == 'name'
feature_table = feature_table.loc[is_name_feature]

In [10]:
# Display the features that remain in the table
feature_table


Out[10]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
6 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
7 name_name_cos_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 cosine <function name_name_cos_dlm_dc0_dlm_dc0 at 0x10f9b0b70> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
8 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
9 name_name_mel name name None None monge_elkan <function name_name_mel at 0x10f9b0c80> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
10 name_name_lev_dist name name None None lev_dist <function name_name_lev_dist at 0x10f9b0d08> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
11 name_name_lev_sim name name None None lev_sim <function name_name_lev_sim at 0x10f9b0d90> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
12 name_name_nmw name name None None needleman_wunsch <function name_name_nmw at 0x10f9b0e18> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
13 name_name_sw name name None None smith_waterman <function name_name_sw at 0x10f9b0ea0> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True

In [11]:
# Keep only the features whose similarity function is 'jaccard'; every other
# feature is removed from the table.
is_jaccard_feature = feature_table['simfunction'] == 'jaccard'
feature_table = feature_table.loc[is_jaccard_feature]

In [12]:
# Display the features that remain in the table
feature_table


Out[12]:
feature_name left_attribute right_attribute left_attr_tokenizer right_attr_tokenizer simfunction function function_source is_auto_generated
6 name_name_jac_qgm_3_qgm_3 name name qgm_3 qgm_3 jaccard <function name_name_jac_qgm_3_qgm_3 at 0x10f9b0ae8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True
8 name_name_jac_dlm_dc0_dlm_dc0 name name dlm_dc0 dlm_dc0 jaccard <function name_name_jac_dlm_dc0_dlm_dc0 at 0x10f9b0bf8> from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ... True